{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "228bfca6-72ee-4656-b7c4-7694ab085518",
   "metadata": {
    "papermill": {
     "duration": 0.015626,
     "end_time": "2021-10-20T19:08:27.084676",
     "exception": false,
     "start_time": "2021-10-20T19:08:27.069050",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "# Pulls Codenet classification data from the ml-exchange.org"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "excess-grass",
   "metadata": {
    "papermill": {
     "duration": 0.00612,
     "end_time": "2021-10-20T19:08:27.101041",
     "exception": false,
     "start_time": "2021-10-20T19:08:27.094921",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "Pulls Codenet classification data.zip from the ml-exchange.org in a form ready for text classification in the folowing format:\n",
    "\n",
    "zip_root/data/train/*language_name*/**code_sample_file_id**   \n",
    "zip_root/data/test/*language_nam*/**code_sample_file_id**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "assigned-lottery",
   "metadata": {
    "papermill": {
     "duration": 3.501232,
     "end_time": "2021-10-20T19:08:30.607438",
     "exception": false,
     "start_time": "2021-10-20T19:08:27.106206",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "!pip3 install wget==3.2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "impaired-sharing",
   "metadata": {
    "papermill": {
     "duration": 0.018958,
     "end_time": "2021-10-20T19:08:30.642874",
     "exception": false,
     "start_time": "2021-10-20T19:08:30.623916",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import wget\n",
    "import logging\n",
    "import os\n",
    "import re\n",
    "import shutil\n",
    "import sys\n",
    "import tarfile"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "local-gather",
   "metadata": {
    "papermill": {
     "duration": 0.012791,
     "end_time": "2021-10-20T19:08:30.662275",
     "exception": false,
     "start_time": "2021-10-20T19:08:30.649484",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# file name for training data zip\n",
    "output_filename = os.environ.get('output_filename', 'data.zip')\n",
    "\n",
    "# temporal data storage for local execution\n",
    "data_dir = os.environ.get('data_dir', '../../data/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a1cd43db-f550-4bc9-8353-e73bbf1e39ad",
   "metadata": {
    "papermill": {
     "duration": 0.013218,
     "end_time": "2021-10-20T19:08:30.681670",
     "exception": false,
     "start_time": "2021-10-20T19:08:30.668452",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "parameters = list(\n",
    "    map(lambda s: re.sub('$', '\"', s),\n",
    "        map(\n",
    "            lambda s: s.replace('=', '=\"'),\n",
    "            filter(\n",
    "                lambda s: s.find('=') > -1 and bool(re.match(r'[A-Za-z0-9_]*=[.\\/A-Za-z0-9]*', s)),\n",
    "                sys.argv\n",
    "            )\n",
    "    )))\n",
    "\n",
    "for parameter in parameters:\n",
    "    logging.warning('Parameter: ' + parameter)\n",
    "    exec(parameter)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "widespread-ghana",
   "metadata": {
    "papermill": {
     "duration": 0.142712,
     "end_time": "2021-10-20T19:08:30.830642",
     "exception": false,
     "start_time": "2021-10-20T19:08:30.687930",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "wget.download('https://dax-cdn.cdn.appdomain.cloud/dax-project-codenet/1.0.0/Project_CodeNet_LangClass.tar.gz')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fa7ef912-14cb-4e97-b288-38c35fbbf5ff",
   "metadata": {
    "papermill": {
     "duration": 0.020472,
     "end_time": "2021-10-20T19:08:30.859919",
     "exception": false,
     "start_time": "2021-10-20T19:08:30.839447",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "if os.path.exists('data'):\n",
    "    shutil.rmtree('data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e5949af-eeed-46b4-bfe6-e16f0ba52a8d",
   "metadata": {
    "papermill": {
     "duration": 0.138615,
     "end_time": "2021-10-20T19:08:31.016636",
     "exception": false,
     "start_time": "2021-10-20T19:08:30.878021",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "with tarfile.open('Project_CodeNet_LangClass.tar.gz') as tf:\n",
    "    tf.extractall()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f366a1f7-3b7b-4366-b39d-8361d7b90e0d",
   "metadata": {
    "papermill": {
     "duration": 0.17427,
     "end_time": "2021-10-20T19:08:31.198222",
     "exception": false,
     "start_time": "2021-10-20T19:08:31.023952",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "shutil.make_archive(data_dir + output_filename.split('.zip')[0], 'zip', 'data')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "papermill": {
   "default_parameters": {},
   "duration": 5.366235,
   "end_time": "2021-10-20T19:08:31.515871",
   "environment_variables": {},
   "exception": null,
   "input_path": "/home/jovyan/work/claimed/component-library/input/input-codenet-LangClass.ipynb",
   "output_path": "/home/jovyan/work/claimed/component-library/input/input-codenet-LangClass.ipynb",
   "parameters": {},
   "start_time": "2021-10-20T19:08:26.149636",
   "version": "2.3.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
